#!/bin/bash
# Launches one node of a multi-node torch.distributed training run and appends
# all output to the log file reported by setup_env.sh.
#
# Both `setup_env.sh` and `cd ../..` are resolved relative to the current
# working directory, so run this script from the directory that contains it.
#
# setup_env.sh is expected to print four whitespace-separated fields on its
# first output line: log file, model name, current dir, YAML config file.
# NOTE(review): whether the log file path is absolute is not visible here; a
# relative path would resolve differently after the `cd ../..` below - confirm.

# -r keeps backslashes in the emitted values literal.
read -r log_file MODEL_NAME CURRENT_DIR YAML_FILE < <(bash setup_env.sh)

# Abort early if setup_env.sh produced no usable values. An empty MODEL_NAME
# would otherwise turn the final cleanup into `rm -rf ./saves/`.
: "${log_file:?setup_env.sh did not provide a log file path}"
: "${MODEL_NAME:?setup_env.sh did not provide a model name}"
: "${YAML_FILE:?setup_env.sh did not provide a YAML config file}"

echo "log file: $log_file" "$MODEL_NAME" "$CURRENT_DIR" "$YAML_FILE"
# ----------------------------------------------------------------------------------------
# Use the first network interface whose name starts with "ib" (e.g. ibs2, ib0).
# Assignment and export are split so the pipeline's status is not masked.
NCCL_SOCKET_IFNAME=$(ifconfig -a | grep -oP '^\S+' | sed 's/://g' | grep -E '^ib.*$' | head -n 1)
export NCCL_SOCKET_IFNAME
export NCCL_DEBUG=INFO

NPROC_PER_NODE=16
NNODES=2
RANK=0                      # 0 on the master node, 1 on the other machine  # <-------
MASTER_ADDR="10.31.10.130"  # address of the master node
MASTER_PORT=29505

echo "NCCL_SOCKET_IFNAME: $NCCL_SOCKET_IFNAME"  | tee -a "$log_file"

# src/train.py and ./saves are addressed relative to the directory two levels
# up; never continue (and never reach the rm below) from the wrong directory.
cd ../.. || { echo "error: cannot cd to ../.. from $PWD" >&2; exit 1; }

python3 -m torch.distributed.run \
    --nproc_per_node "$NPROC_PER_NODE" \
    --nnodes "$NNODES" \
    --node_rank "$RANK" \
    --master_addr "$MASTER_ADDR" \
    --master_port "$MASTER_PORT" \
    src/train.py "$YAML_FILE" \
    2>&1 | tee -a "$log_file"
# The pipeline's own status is tee's; keep the trainer's status instead.
train_status=${PIPESTATUS[0]}

echo "log saved: $log_file" | tee -a "$log_file"
echo "------EOF" | tee -a "$log_file"

# Delete the saved checkpoints to save disk space. `--` and `:?` guard against
# option-like or empty model names.
rm -rf -- "./saves/${MODEL_NAME:?}"

# Report the training result to the caller.
exit "$train_status"
